{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "# Display the value of every top-level expression in a cell, not only the last one.\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "# Automatically reload imported modules so edits to them are picked up without a kernel restart.\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Extend the module search path with the local checkouts this notebook imports from.\n",
    "for extra_path in (\n",
    "    '/home/mink/notebooks/CameraTraps',  # this repo\n",
    "    '/home/mink/lib/ai4eutils',\n",
    "):\n",
    "    sys.path.append(extra_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import random\n",
    "from collections import Counter, defaultdict\n",
    "from multiprocessing.pool import ThreadPool\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils  # ai4eutils\n",
    "import sas_blob_utils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# parkscanada_garrow_201920_trains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'parkscanada_garrow_201920_trains'\n",
    "\n",
    "# root folder that the image paths in the sequence items are relative to\n",
    "container_root = '/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/'\n",
    "\n",
    "# final and temporary JSON outputs, both named after the dataset\n",
    "megadb_jsons_dir = '/home/mink/camtraps/data/megadb_jsons'\n",
    "path_to_output = '{}/{}.json'.format(megadb_jsons_dir, dataset_name)\n",
    "path_to_output_temp = '{}/{}_temp.json'.format(megadb_jsons_dir, dataset_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database\n",
    "\n",
    "No sequence information is available, so each image is stored as its own single-frame sequence with a dummy `seq_id`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "api_inputs_dir = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada'\n",
    "\n",
    "# Sort the listing: in the recorded run recursive_file_list returned the files unsorted.\n",
    "# The order of these files fixes the order of `all_paths`, and with it the numbering of the\n",
    "# dummy seq_ids generated below, so it has to be the same on every run.\n",
    "api_inputs = sorted(path_utils.recursive_file_list(api_inputs_dir))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada/api_inputs/parkscanada-garrow-20210207/parkscanada-garrow-20210207.chunk001.json',\n",
       " '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada/api_inputs/parkscanada-garrow-20210207/parkscanada-garrow-20210207.chunk000.json',\n",
       " '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada/api_inputs/parkscanada-garrow-20200409/parkscanada-garrow-20200409.chunk000.json']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "api_inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1816046"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Merge the entries of every API input JSON file into one flat list.\n",
    "all_paths = []\n",
    "\n",
    "for api_input_path in api_inputs:\n",
    "    with open(api_input_path) as api_input_file:\n",
    "        all_paths.extend(json.load(api_input_file))\n",
    "\n",
    "len(all_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_91291images_Feb202020_March212020/100RECNX/RCNX6570.JPG'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'CPRail5_589703_5670051_17_12_2019to29_12_2019/101RECNX/IMG_1660.JPG'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_paths[1000]\n",
    "all_paths[-1000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2019 folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1816046/1816046 [00:03<00:00, 562853.03it/s] \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "218417"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences_2019 = []\n",
    "locations_2019 = set()\n",
    "\n",
    "for image_path in tqdm(all_paths):\n",
    "    # paths under 'garrow-2020.' are labelled in the 2020 cell; non-image files are dropped\n",
    "    if image_path.startswith('garrow-2020.') or not path_utils.is_image_file(image_path):\n",
    "        continue\n",
    "\n",
    "    # the top-level folder looks like 'CPRail5_589703_...'; the part before the first '_' is the location\n",
    "    top_level_folder = image_path.split('/')[0]\n",
    "    location = '2019_' + top_level_folder.split('_')[0]\n",
    "    locations_2019.add(location)\n",
    "\n",
    "    # every image becomes its own single-frame sequence, numbered in order of appearance\n",
    "    single_image_seq = {\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'dummy_2019_{len(sequences_2019)}',\n",
    "        'location': location,\n",
    "        'class': ['train'],\n",
    "        'images': [{'file': image_path, 'frame_num': 1}],\n",
    "    }\n",
    "    sequences_2019.append(single_image_seq)\n",
    "\n",
    "len(sequences_2019)\n",
    "len(locations_2019)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'2019_CPRail1',\n",
       " '2019_CPRail2',\n",
       " '2019_CPRail3',\n",
       " '2019_CPRail4',\n",
       " '2019_CPRail5'}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "locations_2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'parkscanada_garrow_201920_trains',\n",
       "  'seq_id': 'dummy_2019_138479',\n",
       "  'location': '2019_CPRail3',\n",
       "  'class': ['train'],\n",
       "  'images': [{'file': 'CPRail3_593114_5669476_13_11_2019to22_11_2019/101RECNX/IMG_1371.JPG',\n",
       "    'frame_num': 1}]}]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(sequences_2019, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2020 folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1816046/1816046 [00:15<00:00, 118761.22it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1597625"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences_2020 = []\n",
    "locations_2020 = set()\n",
    "\n",
    "for image_path in tqdm(all_paths):\n",
    "    # only image files under the 'garrow-2020.' prefix are handled here\n",
    "    if not image_path.startswith('garrow-2020.') or not path_utils.is_image_file(image_path):\n",
    "        continue\n",
    "\n",
    "    # the third path component looks like 'CPRail3_592916_...'; the part before the first '_' is the location\n",
    "    deployment_folder = image_path.split('/')[2]\n",
    "    location = '2020_' + deployment_folder.split('_')[0]\n",
    "    locations_2020.add(location)\n",
    "\n",
    "    # every image becomes its own single-frame sequence, numbered in order of appearance\n",
    "    single_image_seq = {\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'dummy_2020_{len(sequences_2020)}',\n",
    "        'location': location,\n",
    "        'class': ['train'],\n",
    "        'images': [{'file': image_path, 'frame_num': 1}],\n",
    "    }\n",
    "    sequences_2020.append(single_image_seq)\n",
    "\n",
    "len(sequences_2020)\n",
    "len(locations_2020)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'2020_CPRail1',\n",
       " '2020_CPRail2',\n",
       " '2020_CPRail3',\n",
       " '2020_CPRail4',\n",
       " '2020_CPRail5'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "locations_2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'parkscanada_garrow_201920_trains',\n",
       "  'seq_id': 'dummy_2020_7034',\n",
       "  'location': '2020_CPRail3',\n",
       "  'class': ['train'],\n",
       "  'images': [{'file': 'garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_91291images_Feb202020_March212020/101RECNX/RCNX2605.JPG',\n",
       "    'frame_num': 1}]}]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(sequences_2020, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1816042"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2019 items first, followed by the 2020 items\n",
    "sequences = [*sequences_2019, *sequences_2020]\n",
    "len(sequences)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 3min 40s, sys: 103 ms, total: 3min 40s\n",
      "Wall time: 3min 40s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Run the megadb schema check over all sequence items before sampling\n",
    "# (about 3 min 40 s for the ~1.8M items in the recorded run).\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - Sample\n",
    "\n",
    "We take a random sample of only 25k of the images.\n",
    "\n",
    "Only this sample will be included in the megadb."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SAMPLE_SIZE = 25000\n",
    "SAMPLE_SEED = 0  # fixed so that re-running the notebook draws the same sample\n",
    "\n",
    "# A dedicated, seeded generator makes the draw reproducible and keeps it independent of\n",
    "# the other (unseeded) uses of `sample` in this notebook.\n",
    "sequences_sample = random.Random(SAMPLE_SEED).sample(sequences, SAMPLE_SIZE)\n",
    "\n",
    "locations_sampled = {seq['location'] for seq in sequences_sample}\n",
    "len(locations_sampled)  # should be 10 if every 2019 and 2020 location is represented"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sequences_sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the sampled sequence items to the temporary output file.\n",
    "# ensure_ascii=False writes non-ASCII characters as-is instead of \\uXXXX escapes.\n",
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences_sample, f, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2c - Download the sampled images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One newline-terminated image path per image in the sample.\n",
    "list_to_download = [\n",
    "    image['file'] + '\\n'\n",
    "    for sampled_seq in sequences_sample\n",
    "    for image in sampled_seq['images']\n",
    "]\n",
    "len(list_to_download)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_56019images_Sept102020_Oct102020/105RECNX/RCNX2952.JPG\\n'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_to_download[10000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the download list, one path per line (each entry already ends with a newline).\n",
    "with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/parkscanada_trains_files.txt', 'w') as f:\n",
    "    f.writelines(list_to_download)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2d - Copy to flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 25000/25000 [00:00<00:00, 77210.24it/s]\n"
     ]
    }
   ],
   "source": [
    "flat_folder = '/mink_disk_0/camtraps/imerit12g'\n",
    "path_pairs = []\n",
    "\n",
    "# Pair each sampled image with a flat destination name that encodes dataset, sequence and frame.\n",
    "for seq in tqdm(sequences_sample):\n",
    "    seq_id = seq['seq_id']\n",
    "\n",
    "    for im in seq['images']:\n",
    "        flat_name = '{}.seq{}.frame{}.jpg'.format(dataset_name, seq_id, im['frame_num'])\n",
    "        path_pairs.append((\n",
    "            os.path.join(container_root, im['file']),\n",
    "            os.path.join(flat_folder, flat_name),\n",
    "        ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "[('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/CPRail4_591659_5669241_17_12_2019to28_12_2019/101RECNX/IMG_4496.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2019_182929.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/garrow-2020.12.30/Garrow2021ParksCanada/CPRail2_594233_5669466_16881images_June172020_July152020/100RECNX/IMG_8751.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2020_878340.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_59172images_April92020_May82020/103RECNX/RCNX0172.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2020_1357262.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_64565images_May82020_June172020/100RECNX/RCNX2125.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2020_1388923.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/CPRail2_594249_5669442_13_11_2019to19_11_2019/100RECNX/IMG_4587.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2019_40156.frame1.jpg')]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)\n",
    "sample(path_pairs, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 32.2 s, sys: 32.6 s, total: 1min 4s\n",
      "Wall time: 29.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "def copy_file(src_path, dst_path):\n",
    "    \"\"\"Copy the file at src_path to dst_path and return dst_path.\"\"\"\n",
    "    return copyfile(src_path, dst_path)\n",
    "\n",
    "# copyfile does not create missing folders, so make sure every destination folder exists first\n",
    "for dst_dir in {os.path.dirname(dst_path) for _, dst_path in path_pairs}:\n",
    "    os.makedirs(dst_dir, exist_ok=True)\n",
    "\n",
    "# run the copies on 12 worker threads; results come back in the order of path_pairs\n",
    "with ThreadPool(12) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dst_paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
